In [1]:
import pandas as pd
import geopandas as gpd
import shapely.wkt
import urllib
import os
import numpy as np
In [2]:
# if running on Colab, uncomment and run this line below too:
# !pip install mapclassify
In [3]:
# Directory for exported CSVs; create it up front so later writes can't fail.
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)
In [4]:
# Functions
def get_all_organisations():
    """Fetch the organisation lookup from the digital-land datasette.

    Returns
    -------
    pandas.DataFrame
        One row per organisation with its entity id, name, dataset and
        geography codes, plus a derived `statistical_geography` column:
        the local-authority-district code for local authorities, the
        local-planning-authority code otherwise.
    """
    # Single quotes for the SQL string literal: in SQLite, double quotes
    # denote identifiers and only fall back to string literals as a
    # misfeature, so 'local-authority' is the correct spelling.
    sql = """
        select entity as organisation_entity, name, organisation, dataset,
               local_planning_authority, local_authority_district,
               case when dataset = 'local-authority'
                    then local_authority_district
                    else local_planning_authority
               end as statistical_geography
        from organisation
    """
    params = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    return pd.read_csv(url)
def get_pdp_geo_dataset(dataset, underscore_cols=True, crs_out=27700):
    """Download a planning.data.gov.uk dataset as a GeoDataFrame.

    Parameters
    ----------
    dataset : str
        Dataset slug, e.g. "local-planning-authority".
    underscore_cols : bool, default True
        Replace "-" with "_" in column names for attribute-style access.
    crs_out : int, default 27700
        EPSG code to project to (27700 = British National Grid, metres).

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset reprojected to `crs_out`.
    """
    url = f"https://files.planning.data.gov.uk/dataset/{dataset}.geojson"
    gdf = gpd.read_file(url)
    if underscore_cols:
        gdf.columns = [col.replace("-", "_") for col in gdf.columns]
    # GeoJSON is WGS84 by definition, so only tag the CRS when read_file
    # didn't attach one: set_crs raises if a different CRS already exists.
    if gdf.crs is None:
        gdf = gdf.set_crs(epsg=4326)
    return gdf.to_crs(epsg=crs_out)
Data in¶
In [5]:
# Organisation lookup (entity id -> organisation name and geography codes),
# used below to resolve which org supplied each conservation area.
org_df = get_all_organisations()
print(len(org_df))
452
In [6]:
# Manually-collected conservation area counts, published as a Google Sheet CSV.
MANUAL_COUNT_URL = (
    "https://docs.google.com/spreadsheets/d/e/"
    "2PACX-1vSGZIudsGx0ez4cU-4wSvymvXIFfpDb_qfbS3uW5RiuBkJrJQ9D8k0HBUPtgncRXA"
    "/pub?gid=485605871&single=true&output=csv"
)
con_count_df = pd.read_csv(MANUAL_COUNT_URL)
con_count_df.columns = con_count_df.columns.str.replace("-", "_")
# Attach organisation names and LPA codes to each manual count row.
con_count_lpa_df = con_count_df.merge(
    org_df[["organisation_entity", "name", "local_planning_authority"]],
    how="left",
    on="organisation_entity",
)
print(len(con_count_lpa_df))
# con_count_lpa_df.head()
328
In [7]:
# Conservation areas from the platform — point geometries only.
ca_df = pd.read_csv(
    "https://files.planning.data.gov.uk/dataset/conservation-area.csv",
    usecols=["entity", "name", "organisation-entity", "reference", "entry-date", "point"],
)
ca_df.columns = ca_df.columns.str.replace("-", "_")
# Parse WKT points, then project WGS84 -> British National Grid (EPSG:27700)
# for more interpretable (metre-based) units.
ca_df["point"] = gpd.GeoSeries.from_wkt(ca_df["point"])
ca_gdf = gpd.GeoDataFrame(ca_df, geometry="point", crs="EPSG:4326").to_crs(epsg=27700)
In [8]:
# Latest ONS LPA file (April 2023 boundaries, via the ArcGIS FeatureServer
# query API), used below to flag whether pdp LPAs are 2023 boundaries or not.
ons_lpa_gpd = gpd.read_file("https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Local_Planning_Authorities_April_2023_Boundaries_UK_BGC/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson",)
print(len(ons_lpa_gpd))
# ons_lpa_gpd.head()
379
In [9]:
# LPA boundaries from the PDP site.
lpa_gdf = get_pdp_geo_dataset("local-planning-authority")
# Flag boundaries whose code appears in the ONS April 2023 LPA file.
# .isin already returns booleans, so the np.where(cond, True, False)
# wrapper was redundant.
lpa_gdf["lpa_2023"] = lpa_gdf["reference"].isin(ons_lpa_gpd["LPA23CD"])
lpa_gdf = lpa_gdf.rename(columns={"name": "lpa_name"})
print(len(lpa_gdf))
# lpa_gdf.head()
337
Analysis¶
Spatial joining¶
In [10]:
# Spatially join every conservation-area point to the LPA polygon it
# intersects, then look up the name of the organisation that supplied it.
lpa_cols = lpa_gdf[["reference", "lpa_name", "lpa_2023", "geometry"]]
ca_cols = ca_gdf[["entity", "organisation_entity", "point"]]
lpa_ca_join = gpd.sjoin(lpa_cols, ca_cols, how="left", predicate="intersects")
lpa_ca_join = lpa_ca_join.merge(
    org_df[["organisation_entity", "name"]],
    how="left",
    on="organisation_entity",
)
# Cast provider names to str so NaNs survive the ', '.join aggregation later.
lpa_ca_join["name"] = lpa_ca_join["name"].astype(str)
print(len(lpa_ca_join))
# lpa_ca_join.head()
9275
In [11]:
# Rank the providing org type per row (1 = LPA, 2 = Historic England,
# 3 = no provider), so that when we group and take the min per LPA, an
# LPA-supplied area outranks a Historic England one.
org_entity = lpa_ca_join["organisation_entity"]
lpa_ca_join["org_type_rank"] = np.where(
    org_entity == 16,  # entity 16 maps to "Historic England" (see rank mapping below)
    2,
    np.where(org_entity.notnull(), 1, 3),
)
# lpa_ca_join.head()
In [12]:
# Count conservation areas per LPA, then join on the manual counts.
# NOTE: the merge leaves two "name" columns suffixed name_x (provider names
# from the groupby) and name_y (from the manual sheet); the rename below
# relies on exactly that suffixing.
lpa_ca_join_count = lpa_ca_join.groupby(
    ["reference", "lpa_name", "lpa_2023"]
).agg(
    {"entity" : "count",                    # number of CA points in the LPA
    "name" : lambda x: ', '.join(set(x)),   # distinct provider names, comma-joined
    "org_type_rank" : "min"}                # best (lowest) provider rank wins
).reset_index(
).merge(
    con_count_lpa_df[["local_planning_authority", "name", "conservation_area_count"]],
    how = "left",
    left_on = "reference",
    right_on = "local_planning_authority"
)
# rename cols
lpa_ca_join_count.rename(columns=
    {"entity":"count_platform",
    "name_x":"platform_data_providers",
    "conservation_area_count":"count_manual",
    "name_y":"lpa_name_manual"}, inplace = True)
# calculate count comparison delta: relative difference vs the manual count
# (negative = platform has fewer areas than the manual count)
lpa_ca_join_count["count_delta"] = (lpa_ca_join_count["count_platform"] - lpa_ca_join_count["count_manual"]) / lpa_ca_join_count["count_manual"]
lpa_ca_join_count["count_delta_abs"] = abs(lpa_ca_join_count["count_delta"])
# use org type rank to flag the best provider for an area
lpa_ca_join_count["provider_org_type"] = lpa_ca_join_count["org_type_rank"].map({1:"LPA", 2:"Historic England", 3:"None"})
# write to csv
lpa_ca_join_count.to_csv(os.path.join(output_dir, "LPA_conservation_area_count_comparison.csv"), index = False)
lpa_ca_join_count.head()
Out[12]:
| reference | lpa_name | lpa_2023 | count_platform | platform_data_providers | org_type_rank | local_planning_authority | lpa_name_manual | count_manual | count_delta | count_delta_abs | provider_org_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | E60000001 | County Durham LPA | True | 92 | Historic England | 2 | E60000001 | Durham County Council | 93.0 | -0.010753 | 0.010753 | Historic England |
| 1 | E60000002 | Darlington LPA | True | 17 | Historic England | 2 | E60000002 | Darlington Borough Council | 17.0 | 0.000000 | 0.000000 | Historic England |
| 2 | E60000003 | Hartlepool LPA | True | 8 | Historic England | 2 | E60000003 | Hartlepool Borough Council | 8.0 | 0.000000 | 0.000000 | Historic England |
| 3 | E60000004 | Middlesbrough LPA | True | 8 | Historic England | 2 | E60000004 | Middlesbrough Borough Council | 8.0 | 0.000000 | 0.000000 | Historic England |
| 4 | E60000005 | Northumberland LPA | True | 68 | Historic England | 2 | E60000005 | Northumberland County Council | 70.0 | -0.028571 | 0.028571 | Historic England |
Get single LPA layers¶
Where we have manual CA counts from organisations which are now technically "retired" LPAs (i.e. replaced by a newer LPA), it indicates that the data is still divided up and provided by these historic organisations. In these cases we don't want to show the new 2023 LPA on the map at the same time: it overlaps the retired boundaries, which is confusing, since we haven't technically collected data from the new organisation.
So we want to find the new 2023 LPAs which sit over retired LPAs that have supplied us with data, so we can remove them from the map and get a single contiguous layer which is a mix of historic and current LPA boundaries.
In [13]:
# GeoDataFrame of the count comparison joined back to every LPA boundary.
comparison_cols = [
    "reference", "lpa_name", "lpa_2023", "provider_org_type",
    "platform_data_providers", "count_platform", "count_manual", "count_delta",
]
lpa_ca_join_count_gdf = lpa_gdf[["reference", "geometry"]].merge(
    lpa_ca_join_count[comparison_cols],
    how="left",
    on="reference",
)
In [14]:
# "Old" LPAs: not a 2023 boundary, but with a manual count — i.e. the retired
# organisation is still the one dividing up and supplying the data.
# (`== False` deliberately treats NaN lpa_2023 values as not-old.)
is_old_with_count = (
    (lpa_ca_join_count_gdf["lpa_2023"] == False)
    & lpa_ca_join_count_gdf["count_manual"].notnull()
)
old_lpas = lpa_ca_join_count_gdf[is_old_with_count]
# Buffer the 2023 boundaries slightly (100 m) so the "within" predicate
# tolerates small edge discrepancies between old and new boundaries.
buffered_new_lpas = lpa_ca_join_count_gdf.loc[
    lpa_ca_join_count_gdf["lpa_2023"] == True,
    ["reference", "lpa_name", "geometry"],
].copy()
buffered_new_lpas["geometry"] = buffered_new_lpas["geometry"].buffer(100)
# 2023 LPAs to flag are those that contain one of the "old" LPAs above.
new_lpas = gpd.sjoin(
    old_lpas[["reference", "lpa_name", "lpa_2023", "geometry"]],
    buffered_new_lpas,
    how="inner",
    predicate="within",
)
In [15]:
# Flag whether each LPA belongs in the combined historic + current layer:
# show retired LPAs that we have manual counts for, plus 2023 LPAs that do
# not sit over one of those retired LPAs.
old_lpas_incl_list = old_lpas["reference"].drop_duplicates().values
new_lpas_excl_list = new_lpas["reference_right"].drop_duplicates().values
# is in the old include list — show
is_included_old = lpa_ca_join_count_gdf["reference"].isin(old_lpas_incl_list)
# is new and not in the new exclude list — show
is_kept_new = (lpa_ca_join_count_gdf["lpa_2023"] == True) & (
    ~lpa_ca_join_count_gdf["reference"].isin(new_lpas_excl_list)
)
# np.select with choices [True, True] and default False is just the OR
# of the two conditions.
lpa_ca_join_count_gdf["old_lpa_combo_display"] = is_included_old | is_kept_new
In [16]:
# Bin the count-comparison deltas into 10%-wide bands for the map legend.
delta_bin_edges = [-np.inf, -0.5, -0.4, -0.3, -0.2, -0.1, 0,
                   0.1, 0.2, 0.3, 0.4, 0.5, np.inf]
delta_bin_labels = ["< -50%", "-50% : -40%", "-40% : -30%", "-30% : -20%",
                    "-20% : -10%", "-10% : 0", "0 : +10%", "+10% : +20%",
                    "+20% : +30%", "+30% : +40%", "+40% : +50%", "> +50% "]
lpa_ca_join_count_gdf["count_delta_bins"] = pd.cut(
    lpa_ca_join_count_gdf["count_delta"],
    bins=delta_bin_edges,
    labels=delta_bin_labels,
)
In [17]:
# Simplify polygon geometry (100 m tolerance in EPSG:27700) so the rendered
# maps stay small — helps when running in Colab.
simplify_tolerance_m = 100
lpa_ca_join_count_gdf["geometry"] = lpa_ca_join_count_gdf["geometry"].simplify(simplify_tolerance_m)
In [21]:
def org_type_colormap(value):
    """Map a provider org type to a hex colour; grey for anything else."""
    colour_by_org_type = {
        "LPA": "#28A197",
        "Historic England": "#12436D",
    }
    return colour_by_org_type.get(value, "grey")
# Map the combined historic + current layer, coloured by provider org type.
# (Note: explore takes the cmap list directly; org_type_colormap defined
# above is not passed to this call.)
combo_display_gdf = lpa_ca_join_count_gdf[
    lpa_ca_join_count_gdf["old_lpa_combo_display"] == True
]
combo_display_gdf.explore(
    tiles="CartoDB positron",
    column="provider_org_type",
    tooltip=["reference", "lpa_name", "platform_data_providers"],
    cmap=["#1d70b8", "#28A197", "white"],
    style_kwds={"color": "black", "weight": 1},
)
Out[21]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [22]:
# Shared masks for the two comparison layers below.
on_display = lpa_ca_join_count_gdf["old_lpa_combo_display"] == True
platform_count = lpa_ca_join_count_gdf["count_platform"]
manual_count = lpa_ca_join_count_gdf["count_manual"]
# LPAs where the platform count agrees exactly with the manual count.
count_comp_equal_gdf = lpa_ca_join_count_gdf[
    on_display & (platform_count == manual_count)
].copy()
count_comp_equal_gdf["count_comparison"] = "match"
# LPAs with platform data whose count disagrees with the manual count.
count_comp_gdf = lpa_ca_join_count_gdf[
    on_display & (platform_count > 0) & (platform_count != manual_count)
].copy()
Conservation area count comparison: matches¶
In [23]:
# Map of LPAs where the platform count matches the manual count (green).
match_tooltip = [
    "reference", "lpa_name", "platform_data_providers",
    "count_platform", "count_manual", "count_comparison",
]
count_comp_equal_gdf.explore(
    tiles="CartoDB positron",
    color="green",
    tooltip=match_tooltip,
    style_kwds={"color": "black", "weight": 1, "fillOpacity": 0.3},
)
Out[23]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Conservation area count comparison: mis-matches¶
In [24]:
# Map of LPAs where the platform and manual counts disagree, shaded by the
# binned relative delta.
mismatch_tooltip = [
    "reference", "lpa_name", "platform_data_providers",
    "count_platform", "count_manual", "count_delta", "count_delta_bins",
]
count_comp_gdf.explore(
    tiles="CartoDB positron",
    column="count_delta_bins",
    tooltip=mismatch_tooltip,
    cmap="coolwarm",
    style_kwds={"color": "black", "weight": 1},
)
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: